import pandas as pd
import numpy as np
import project_tests as t
import re
from IPython.display import display
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
from plotly import tools
import matplotlib as plt
import nltk
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger','stopwords'])
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
%matplotlib inline
init_notebook_mode(connected = True)
pd.set_option('display.max_colwidth', 200)
# Read the user/article interaction log and the article content table.
df = pd.read_csv('data/user-item-interactions.csv')
df_content = pd.read_csv('data/articles_community.csv')

# Both files carry an 'Unnamed: 0' column that is not used; drop it in place.
for frame in (df, df_content):
    frame.drop(columns=['Unnamed: 0'], inplace=True)

# Preview the interaction data and its column layout.
df.head()
df.info()

# Preview the article content data and its column layout.
df_content.head()
df_content.info()
1. What is the distribution of how many articles a user interacts with in the dataset? Provide a visual and descriptive statistics that show how many times each user interacts with an article.
# Per article: number of interaction rows that carry a non-null email.
article_read = (
    df.groupby('article_id')['email']
    .count()
    .reset_index(name='readers')
)
print('Summary of viewers per article')
print('------------------------------------------------------')
display(article_read['readers'].describe())

# Per user (email): number of interaction rows, repeat views included.
user_read = (
    df.groupby('email')['article_id']
    .count()
    .reset_index(name='articles')
)
print('Distribution of articles viewed per user')
print('------------------------------------------------------')
display(user_read['articles'].describe())
# Counts at or above this value are lumped together into the trailing bin.
threshold = 300


def _capped_histogram(counts, label, color):
    """Return (capped counts, histogram trace) with counts >= threshold clipped to threshold."""
    capped = counts.apply(lambda x: threshold if x >= threshold else x)
    trace = go.Histogram(
        x=capped,
        opacity=0.75,
        name=label,
        # bins used for histogram
        xbins={'start': 0, 'end': threshold, 'size': 5},
        marker={'color': color},
    )
    return capped, trace


x1, trace1 = _capped_histogram(article_read['readers'], 'Views per Article', 'red')
x2, trace2 = _capped_histogram(user_read['articles'], 'Articles per User', 'darkblue')

# Side-by-side panels: views per article (left), articles per user (right).
fig = tools.make_subplots(rows=1, cols=2)
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 2)

# Font/grid settings shared by both y axes.
y_axis_style = {
    'titlefont': {'size': 12},
    'tickfont': {'size': 10},
    'gridwidth': None,
}

fig['layout'].update(
    height=500,
    width=900,
    title='User x Article Interactions',
    titlefont={'size': 13},
    margin={'l': 50, 'r': 30, 't': 40, 'b': 50},
    bargap=0.1,
    yaxis1={'title': 'Frequency', **y_axis_style},
    xaxis1={
        'title': 'No of Times an Article is Viewed',
        'titlefont': {'size': 10},
    },
    # The right-hand panel deliberately has no y-axis title.
    yaxis2=dict(y_axis_style),
    xaxis2={
        'title': 'No of Articles a User Views',
        'titlefont': {'size': 10},
    },
    # Legend is anchored by its centre/middle at (0.8, 0.8) in figure coordinates.
    legend={
        'x': 0.8,
        'y': 0.8,
        'xanchor': 'center',
        'yanchor': 'middle',
    },
)
iplot(fig)
# Median and maximum number of user-article interactions per user.
articles_per_user = user_read['articles']

# 50% of individuals interact with ____ number of articles or fewer.
median_val = articles_per_user.median()
print(f'50% of individuals interact with {median_val} number of articles or fewer')

# The maximum number of user-article interactions by any 1 user is _____
max_views_by_user = articles_per_user.max()
print(f'The maximum number of user-article interactions by any 1 user is {max_views_by_user}')
2. Explore and remove duplicate articles from the df_content dataframe.
# Show every row whose article_id occurs more than once, grouped by id.
print('Duplicate records in df_content')
is_repeated = df_content['article_id'].duplicated(keep=False)
df_content[is_repeated].sort_values(by=['article_id'])

# Keep only the first row for each article_id.
print('Original Size:', df_content.shape)
is_later_copy = df_content['article_id'].duplicated(keep='first')
df_content = df_content[~is_later_copy]
print('Size after deduped:', df_content.shape)
3. Use the cells below to find:
a. The number of unique articles that have an interaction with a user.
b. The number of unique articles in the dataset (whether they have any interactions or not).
c. The number of unique users in the dataset. (excluding null values)
d. The number of user-article interactions in the dataset.
# a. distinct articles that appear in the interaction log
unique_articles = df['article_id'].nunique()
# b. distinct articles listed in the content file
total_articles = df_content['article_id'].nunique()
# c. distinct users (nunique ignores null emails)
unique_users = df['email'].nunique()
# d. total interaction rows
user_article_interactions = len(df)

print('The number of unique articles that have at least one interaction:', unique_articles)
print('The number of unique articles listed on the community file:', total_articles)
print('The number of unique users:', unique_users)
print('The number of user-article interactions:', user_article_interactions)
4. Use the cells below to find the most viewed article_id, as well as how often it was viewed. After talking to the company leaders, the email_mapper function was deemed a reasonable way to map users to ids. There were a small number of null values, and it was found that all of these null values likely belonged to a single user (which is how they are stored using the function below).
# Row label in article_read of the article with the highest 'readers' count.
# NOTE(review): 'readers' counts non-null emails only, so rows with a null
# email are not included in this maximum - confirm that is intended.
max_index = article_read['readers'].idxmax()
print('Index of article with most views:', max_index)

# Id of that article, rendered as a string (e.g. one value after the decimal).
most_viewed_article_id = str(article_read.at[max_index, 'article_id'])

# How many times that article was viewed.
max_views = article_read.at[max_index, 'readers']

print(f'Most viewed article {most_viewed_article_id}, Maximum view {max_views}')
## No need to change the code here - this will be helpful for later parts of the notebook
# Run this cell to map the user email to a user_id column and remove the email column
def email_mapper():
    '''
    INPUT:
    (none) - reads the 'email' column of the module-level df

    OUTPUT:
    email_encoded - (list) one integer id per row of df, in row order

    Description:
    Assigns ids 1, 2, 3, ... to emails in order of first appearance and
    returns the id for every row.
    '''
    ids_by_email = {}
    # setdefault only stores the new id the first time an email is seen,
    # so the next free id is always "number of emails seen so far + 1".
    return [
        ids_by_email.setdefault(email, len(ids_by_email) + 1)
        for email in df['email']
    ]
# Replace the raw email column with the integer user_id produced by email_mapper.
email_encoded = email_mapper()
df.drop(columns=['email'], inplace=True)
df['user_id'] = email_encoded

# show header
df.head()
## If you stored all your results in the variable names above,
## you shouldn't need to change anything in this cell
# Maps each question string to the answer computed earlier in the notebook.
# The key strings are consumed by t.sol_1_test, so they are left exactly as given.
sol_1_dict = {
'`50% of individuals have _____ or fewer interactions.`': median_val,
'`The total number of user-article interactions in the dataset is ______.`': user_article_interactions,
'`The maximum number of user-article interactions by any 1 user is ______.`': max_views_by_user,
'`The most viewed article in the dataset was viewed _____ times.`': max_views,
'`The article_id of the most viewed article is ______.`': most_viewed_article_id,
'`The number of unique articles that have at least 1 rating ______.`': unique_articles,
'`The number of unique users in the dataset is ______`': unique_users,
'`The number of unique articles on the IBM platform`': total_articles
}
# Test your dictionary against the solution
t.sol_1_test(sol_1_dict)
Unlike in the earlier lessons, we don't actually have ratings for whether a user liked an article or not. We only know that a user has interacted with an article. In these cases, the popularity of an article can really only be based on how often an article was interacted with.
1. Fill in the function below to return the top n articles, ordered from most to fewest interactions. Test your function using the tests below.
def get_top_articles(n, df=df):
    '''
    INPUT:
    n - (int) the number of top articles to return
    df - (pandas dataframe) df as defined at the top of the notebook

    OUTPUT:
    top_articles - (list) the titles of the top 'n' articles, ordered from
                   most to fewest interactions (one title per article id)

    Description:
    Articles are ranked by article_id (not by title) so that distinct
    articles sharing a title are not merged. Each id is then mapped to the
    first title recorded for it in df.
    '''
    # Rank article ids by number of interactions, most interacted first
    interaction_counts = (
        df.groupby('article_id')['user_id']
        .count()
        .sort_values(ascending=False)
    )
    top_ids = list(interaction_counts.index[:n])

    # One title per article id; reindex keeps the ranking order of top_ids
    # (a plain isin() filter would return titles in df row order instead)
    title_by_id = df.drop_duplicates(subset='article_id').set_index('article_id')['title']
    top_articles = title_by_id.reindex(top_ids).tolist()

    return top_articles  # Return the top article titles from df (not df_content)
def get_top_article_ids(n, df=df):
    '''
    INPUT:
    n - (int) the number of top articles to return
    df - (pandas dataframe) df as defined at the top of the notebook

    OUTPUT:
    top_article_ids - (list) the ids of the top 'n' articles as strings,
                      ordered from most to fewest interactions
    '''
    # Rank by article_id (this avoids grouping articles with identical titles)
    ranked = (
        df.groupby('article_id')['user_id']
        .count()
        .sort_values(ascending=False)
    )
    ranked_ids = list(ranked.index)

    return [str(article_id) for article_id in ranked_ids[:n]]  # Return the top article ids
# Top Article IDs, Articles
print('Popular Articles')
print('------------------------------------------------------')
# Look each title up by its own article id so that every printed id/title
# pair really belongs together (zipping two independently ordered lists does
# not guarantee that). Assumes df['article_id'] is numeric, so the string ids
# returned by get_top_article_ids can be converted back with float().
title_by_article_id = df.drop_duplicates(subset='article_id').set_index('article_id')['title']
for article_id in get_top_article_ids(5):
    print(article_id, title_by_article_id.loc[float(article_id)].title())

# Test your function by returning the top 5, 10, and 20 articles
top_5 = get_top_articles(5)
top_10 = get_top_articles(10)
top_20 = get_top_articles(20)

# Test each of your three lists from above
t.sol_2_test(get_top_articles)
1. Use the function below to reformat the df dataframe to be shaped with users as the rows and articles as the columns.
Use the tests to make sure the basic structure of your matrix matches what is expected by the solution.
# create the user-article matrix with 1's and 0's
def create_user_item_matrix(df):
    '''
    INPUT:
    df - pandas dataframe with article_id, title, user_id columns

    OUTPUT:
    user_item - user item matrix

    Description:
    Builds a matrix with one row per user_id and one column per article_id.
    A cell holds 1 when that user interacted with that article at least
    once, and 0 otherwise.
    '''
    # Each (user, article) pair that occurs yields exactly 1, however often it repeats
    pair_flags = df.groupby(['user_id', 'article_id'])['user_id'].nunique()

    # Pivot article ids into columns; pairs that never occurred become 0
    user_item = pair_flags.unstack().fillna(0)

    return user_item  # return the user_item matrix
user_item = create_user_item_matrix(df)
print('User Item matrix:', user_item.shape)

## Tests: You should just need to run this cell. Don't change the code.
n_users, n_articles = user_item.shape
assert n_users == 5149, "Oops!  The number of users in the user-article matrix doesn't look right."
assert n_articles == 714, "Oops!  The number of articles in the user-article matrix doesn't look right."
assert user_item.loc[1].sum() == 36, "Oops!  The number of articles seen by user 1 doesn't look right."
print("You have passed our quick tests!  Please proceed!")
2. Complete the function below which should take a user_id and provide an ordered list of the most similar users to that user (from most similar to least similar). The returned result should not contain the provided user_id, as we know that each user is similar to him/herself. Because the results for each user here are binary, it (perhaps) makes sense to compute similarity as the dot product of two users.
Use the tests to test your function.
def find_similar_users(user_id, user_item=user_item, return_similarity=False):
    '''
    INPUT:
    user_id - (int) a user_id
    user_item - (pandas dataframe) matrix of users by articles:
                1's when a user has interacted with an article, 0 otherwise
    return_similarity - (bool) when True, also return the similarity scores

    OUTPUT:
    similar_users - (list) an ordered list where the closest users (largest dot product users)
                    are listed first; never contains user_id itself
    similarity - (list) only when return_similarity is True: the dot product
                 for each user in similar_users, in the same order

    Description:
    Computes the dot product between the given user's row and every user's
    row, removes the given user, and orders the rest from most to least similar.
    '''
    # Dot product of every user's row with the requested user's row,
    # labelled by user id
    similarity = user_item.dot(user_item.loc[user_id])

    # Remove the user by label. Dropping "the first sorted entry" is not safe:
    # any other user whose dot product equals the self-similarity could be
    # sorted ahead of the user, and would then be discarded instead.
    similarity = similarity.drop(user_id)

    # A stable sort kind keeps the ordering of tied users deterministic
    similarity = similarity.sort_values(ascending=False, kind='mergesort')

    most_similar_users = list(similarity.index)
    if return_similarity:
        return most_similar_users, list(similarity.values)
    return most_similar_users
# Do a spot check of your function
closest_to_1 = find_similar_users(1)[:10]
print(f"The 10 most similar users to user 1 are: {closest_to_1}")
closest_to_3933 = find_similar_users(3933)[:5]
print(f"The 5 most similar users to user 3933 are: {closest_to_3933}")
closest_to_46 = find_similar_users(46)[:3]
print(f"The 3 most similar users to user 46 are: {closest_to_46}")
3. Now that you have a function that provides the most similar users to each user, you will want to use these users to find articles you can recommend. Complete the functions below to return the articles you would recommend to each user.
def get_article_names(article_ids, df=df):
    '''
    INPUT:
    article_ids - (list) a list of article ids (values of df['article_id'],
                  or their string form such as '1024.0')
    df - (pandas dataframe) df as defined at the top of the notebook

    OUTPUT:
    article_names - (list) a list of article names associated with the list of article ids
                    (this is identified by the title column), one per known
                    id, in the order the ids were given

    Description:
    Ids are converted to the dtype of df['article_id'] before matching, so
    string ids are matched explicitly rather than through whatever implicit
    coercion isin() happens to apply. Ids not present in df are skipped.
    '''
    wanted = pd.Series(list(article_ids), dtype=object)
    try:
        wanted = wanted.astype(df['article_id'].dtype)
    except (TypeError, ValueError):
        # Ids that cannot be converted simply will not match anything below
        pass
    wanted = wanted.drop_duplicates()

    # One title per article id (the first one recorded in df)
    title_by_id = df.drop_duplicates(subset='article_id').set_index('article_id')['title']

    # reindex keeps the caller's order; unknown ids come back as NaN and are dropped
    article_names = title_by_id.reindex(wanted).dropna().tolist()

    return article_names  # Return the article names associated with list of article ids
def get_user_articles(user_id, user_item=user_item):
    '''
    INPUT:
    user_id - (int) a user id
    user_item - (pandas dataframe) matrix of users by articles:
                1's when a user has interacted with an article, 0 otherwise

    OUTPUT:
    article_ids - (list) a list of the article ids seen by the user, as strings
    article_names - (list) a list of article names associated with the list of article ids
                    (looked up through get_article_names, i.e. the title column of df)

    Description:
    Provides a list of the article_ids and article titles that have been seen by a user
    '''
    # Columns of this user's row that are flagged 1 are the articles they interacted with
    user_row = user_item.loc[user_id]
    seen_ids = user_row[user_row == 1].index.tolist()

    article_names = get_article_names(seen_ids)
    article_ids = [str(article_id) for article_id in seen_ids]

    return article_ids, article_names
def user_user_recs(user_id, m=10):
    '''
    INPUT:
    user_id - (int) a user id
    m - (int) the number of recommendations you want for the user

    OUTPUT:
    recs - (list) a list of recommendations for the user (article titles,
           at most m of them)

    Description:
    Loops through the users based on closeness to the input user_id
    For each user - finds articles the user hasn't seen before and provides them as recs
    Does this until m recommendations are found

    Notes:
    Users who are the same closeness are chosen arbitrarily as the 'next' user

    For the user where the number of recommended articles starts below m
    and ends exceeding m, the last items are chosen arbitrarily
    '''
    # Articles that must not be recommended: already seen, or already picked
    excluded_ids = set(get_user_articles(user_id, user_item)[0])

    # A list (not a set) so that articles from closer users stay ahead of
    # articles from more distant users when the result is cut to m
    rec_ids = []
    for neighbor in find_similar_users(user_id, user_item):
        if len(rec_ids) >= m:
            break
        neighbor_article_ids, _ = get_user_articles(neighbor, user_item)
        for article_id in neighbor_article_ids:
            if article_id in excluded_ids:
                continue
            excluded_ids.add(article_id)
            rec_ids.append(article_id)
            if len(rec_ids) >= m:
                break

    rec_names = get_article_names(rec_ids[:m], df)
    return rec_names
# Quick Spot Check
divider = '------------------------------------------------------'

# Articles user 20 has already interacted with
print('User 20 Viewed Articles')
print(divider)
user_articles = get_user_articles(20, user_item=user_item)[1]
for seen_title in user_articles:
    print(seen_title.title())

print()

# Recommendations produced for user 20
print("The top 10 recommendations for User 20")
print(divider)
rec_names = user_user_recs(20, 10)
for rec_title in rec_names:
    print(rec_title.title())
# Test your functions here - No need to change this code - just run this cell
# Expected ids and titles for the two users checked below.
expected_ids_user_2 = {'1024.0', '1176.0', '1305.0', '1314.0', '1422.0', '1427.0'}
expected_titles_user_2 = {
    'using deep learning to reconstruct high-resolution audio',
    'build a python app on the streaming analytics service',
    'gosales transactions for naive bayes model',
    'healthcare python streaming application demo',
    'use r dataframes & ibm watson natural language understanding',
    'use xgboost, scikit-learn & ibm watson machine learning apis',
}
expected_ids_user_20 = {'1320.0', '232.0', '844.0'}
expected_titles_user_20 = {
    'housing (2015): united states demographic measures',
    'self-service data preparation with ibm data refinery',
    'use the cloudant-spark connector in python notebook',
}
names_failure = "Oops! Your the get_article_names function doesn't work quite how we expect."

# get_article_names must resolve string ids to titles
assert set(get_article_names(['1024.0', '1176.0', '1305.0', '1314.0', '1422.0', '1427.0'])) == expected_titles_user_2, names_failure
assert set(get_article_names(['1320.0', '232.0', '844.0'])) == expected_titles_user_20, names_failure

# get_user_articles must return matching ids and titles
ids_user_20, titles_user_20 = get_user_articles(20)
assert set(ids_user_20) == expected_ids_user_20
assert set(titles_user_20) == expected_titles_user_20

ids_user_2, titles_user_2 = get_user_articles(2)
assert set(ids_user_2) == expected_ids_user_2
assert set(titles_user_2) == expected_titles_user_2

print("If this is all you see, you passed all of our tests!  Nice job!")
4. Now we are going to improve the consistency of the user_user_recs function from above.
def get_top_sorted_users(user_id, df=df, user_item=user_item):
    '''
    INPUT:
    user_id - (int)
    df - (pandas dataframe) df as defined at the top of the notebook
    user_item - (pandas dataframe) matrix of users by articles:
            1's when a user has interacted with an article, 0 otherwise

    OUTPUT:
    neighbors_df - (pandas dataframe) a dataframe with:
                    neighbor_id - is a neighbor user_id
                    similarity - measure of the similarity of each user to the provided user_id
                    num_interactions - the number of interaction rows in df for that
                                       user (repeat views included)

    Other Details - sort the neighbors_df by the similarity and then by number of interactions where
                    highest of each is higher in the dataframe. The index is reset after
                    sorting, so row 0 is the closest neighbor, row 1 the next, and so on.
    '''
    # Find similar users and similarity scores
    neighbor_ids, similarities = find_similar_users(user_id, user_item, True)
    neighbors_df = pd.DataFrame({'neighbor_id': neighbor_ids, 'similarity': similarities})

    # Find number of interactions of each user (non-unique)
    interactions = (
        df.groupby('user_id')
        .size()
        .reset_index(name='num_interactions')
        .rename(columns={'user_id': 'neighbor_id'})
    )
    neighbors_df = neighbors_df.merge(interactions, on='neighbor_id')

    # Sorting by similarity and then by number of interactions
    neighbors_df = neighbors_df.sort_values(
        by=['similarity', 'num_interactions'], ascending=[False, False]
    )

    # Without this the rows keep their pre-sort labels, and a label lookup such
    # as neighbors_df.neighbor_id[0] would not return the top-ranked neighbor
    neighbors_df = neighbors_df.reset_index(drop=True)

    return neighbors_df
def user_user_recs_part2(user_id, m=10):
    '''
    INPUT:
    user_id - (int) a user id
    m - (int) the number of recommendations you want for the user

    OUTPUT:
    recs - (list) a list of recommendations for the user by article id
           (strings, at most m, best first)
    rec_names - (list) a list of recommendations for the user by article title,
                aligned position by position with recs

    Description:
    Loops through the users based on closeness to the input user_id
    For each user - finds articles the user hasn't seen before and provides them as recs
    Does this until m recommendations are found

    Notes:
    * Choose the users that have the most total article interactions
    before choosing those with fewer article interactions.

    * Choose articles with the articles with the most total interactions
    before choosing those with fewer total interactions.
    '''
    neighbors = get_top_sorted_users(user_id)

    # Articles that must not be recommended: already seen, or already picked
    excluded_ids = set(get_user_articles(user_id, user_item)[0])

    # Total interactions per article, keyed by the same string ids that
    # get_user_articles returns
    popularity = df.groupby('article_id')['user_id'].count()
    popularity.index = popularity.index.map(str)

    # A list (not a set) so the neighbor and popularity ordering survives
    recs = []
    for neighbor_id in neighbors['neighbor_id']:
        if len(recs) >= m:
            break
        neighbor_article_ids, _ = get_user_articles(neighbor_id, user_item)
        unseen = [a for a in neighbor_article_ids if a not in excluded_ids]
        # Within one neighbor, offer the most interacted-with articles first
        unseen.sort(key=lambda a: popularity.get(a, 0), reverse=True)
        for article_id in unseen:
            excluded_ids.add(article_id)
            recs.append(article_id)
    recs = recs[:m]

    # Resolve titles one id at a time so recs[i] and rec_names[i] always refer
    # to the same article; an id with no title maps to an empty string
    rec_names = []
    for article_id in recs:
        names = get_article_names([article_id], df)
        rec_names.append(names[0] if names else '')

    return recs, rec_names
# Quick spot check - don't change this code - just use it to test your functions
print('User 20 Viewed Articles')
print('------------------------------------------------------')
_, user_articles = get_user_articles(20, user_item=user_item)
for title in user_articles:
    print(title.title())

print()
print("The top 10 recommendations for User 20")
print('------------------------------------------------------')
rec_ids, rec_names = user_user_recs_part2(20, 10)
# The loop variable is named rec_id so the builtin id() is not shadowed at
# module level for the rest of the notebook.
for rec_id, title in zip(rec_ids, rec_names):
    print(rec_id, title.title())
5. Use your functions from above to correctly fill in the solutions to the dictionary below. Then test your dictionary against the solution. Provide the code you need to answer each following the comments below.
### Tests with a dictionary of results

# get_top_sorted_users returns its rows sorted best-first. Select by position
# with .iloc: a label lookup such as neighbor_id[0] returns whichever row
# carries index label 0, which is only the top row if the index happens to be
# 0..n-1 in sorted order.

# Find the user that is most similar to user 1
user1_most_sim = get_top_sorted_users(1)['neighbor_id'].iloc[0]

# Find the 10th most similar user to user 131
user131_10th_sim = get_top_sorted_users(131)['neighbor_id'].iloc[9]

## Dictionary Test Here
sol_5_dict = {
    'The user that is most similar to user 1.': user1_most_sim,
    'The user that is the 10th most similar to user 131': user131_10th_sim,
}

t.sol_5_test(sol_5_dict)
6. If we were given a new user, which of the above functions would you be able to use to make recommendations? Explain. Can you think of a better way we might make recommendations? Use the cell below to explain a better method for new users.
Provide your response here.
If given a new user (often referred to as a "cold start" problem in recommendation systems), the traditional user-user collaborative filtering method (which relies on the user's past interactions) is not applicable because the new user has no prior interactions or history. Therefore, of the functions discussed earlier:
get_user_articles: This function wouldn't be useful for a new user because it requires a user's past interactions to recommend articles.
user_user_recs: This function also relies on finding similar users based on past interactions, so it wouldn't be applicable for a new user.
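For new users, a different approach is needed. A common strategy is rank-based (popularity) recommendation: suggest the articles with the most interactions overall. The get_top_articles and get_top_article_ids functions do exactly this and need no history for the user.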
7. Using your existing functions, provide the top 10 recommended articles you would provide for a new user below. You can test your function against our thoughts to make sure we are all on the same page with how we might make a recommendation.
new_user = '0.0'

# What would your recommendations be for this new user '0.0'?  As a new user, they have no observed articles.
# With no history to compare against, fall back to the 10 most popular article ids.
new_user_recs = get_top_article_ids(10)
print(new_user_recs)

expected_new_user_recs = {
    '1314.0', '1429.0', '1293.0', '1427.0', '1162.0',
    '1364.0', '1304.0', '1170.0', '1431.0', '1330.0',
}
assert set(new_user_recs) == expected_new_user_recs, "Oops!  It makes sense that in this case we would want to recommend the most popular articles, because we don't know anything about these users."
print("That's right!  Nice job!")
Another method we might use to make recommendations is to perform a ranking of the highest ranked articles associated with some term. You might consider content to be the doc_body, doc_description, or doc_full_name. There isn't one way to create a content based recommendation, especially considering that each of these columns hold content related information.
1. Use the function body below to create a content based recommender. Since there isn't one right answer for this recommendation tactic, no test functions are provided. Feel free to change the function inputs if you decide you want to try a method that requires more input values. The input values are currently set with one idea in mind that you may use to make content based recommendations. One additional idea is that you might want to choose the most popular recommendations that meet your 'content criteria', but again, there is a lot of flexibility in how you might make these recommendations.
Note: This part is NOT REQUIRED to pass this project. However, you may choose to take this on as an extra way to show off your skills.
Steps to constructing content-based recommendations:
Constructing a Similarity Matrix for Article Titles via NLP TF-IDF
TF-IDF, short for "Term Frequency-Inverse Document Frequency," is used to assign numerical importance scores to words in documents. These scores are then utilized to create TF-IDF vectors for documents, enabling the calculation of document similarities using techniques like cosine similarity. The resulting similarity matrix quantifies the relationships between documents based on their content.
# Distinct article ids on each side, reused for the overlap and union counts.
content_ids = df_content['article_id'].unique()
interaction_ids = df['article_id'].unique()

print('Unique Articles in df_content:', df_content['article_id'].nunique())
print('Unique Articles in df:', df['article_id'].nunique())
print('How many article IDs overlap', len(np.intersect1d(content_ids, interaction_ids)))
print('Total Combined Articles:', len(set(interaction_ids) | set(content_ids)))
print('Note: 277 of 714 articles are not found in df_content')
Note: This step combines all articles from both df_content and df into one comprehensive dataset, which is used to build the TF-IDF matrix and then the cosine similarity among articles. Additionally, we compute article-level statistics, which will play a crucial role in fine-tuning recommendations. When users request popular articles, the user engagement metric will guide the identification of widely viewed articles.
# STEP 1 - Combine All Articles - Viewed or Unseen
#-----------------------------------------------------------------------------------------
# Create a Article Meta DF - combine df_content and aggregated df
# Includes viewed articles and those in the content dataset
# Combine all metrics under article_meta
# One row per (article_id, title) in df with the number of distinct users and the number of interaction rows.
article_meta = df.groupby(['article_id','title']).agg({'user_id': ['nunique', 'count']}).reset_index(drop=False)
# Flatten the two-level column header produced by agg().
article_meta.columns = ['article_id','title','unique_user_views', 'total_user_views']
# NOTE(review): presumably cast to int so the ids match df_content['article_id'] in the merge below - confirm dtypes.
article_meta['article_id'] = article_meta['article_id'].astype(int)
# Outer merge keeps articles that appear in only one of the two sources.
article_meta = pd.merge(df_content[['article_id','doc_full_name']], article_meta, on='article_id', how='outer')
# viewed = 1 when the article has interaction stats, 0 when it only came from df_content.
article_meta['viewed'] = np.where(article_meta['unique_user_views'].isnull(), 0,1)
# Fills every column, text columns included, so a missing doc_full_name or title becomes the number 0.
article_meta.fillna(0, inplace=True)
# Prefer doc_full_name; fall back to title where doc_full_name was missing (0 after the fill above).
article_meta['article_title'] = np.where(article_meta['doc_full_name'] != 0, article_meta['doc_full_name'], article_meta['title'])
# Dense rank by distinct viewers; the most viewed article gets rank 1.
article_meta['rank'] = article_meta['unique_user_views'].rank(ascending=False, method='dense')
# NOTE(review): tfidf_similarity de-duplicates titles, so these two counts need to match for the
# similarity matrix to line up with article_meta rows - verify the printed values.
print('Combined Unique Articles:', article_meta['article_title'].nunique(),article_meta['article_id'].nunique())
# Display the results
display(article_meta.groupby('viewed')[['unique_user_views','total_user_views']].describe().T)
# Ten best-ranked articles (shown as the cell's output in a notebook).
article_meta.sort_values('rank', ascending=True).head(10)
# distribution of user engagement scores (viewed articles only)
viewed = article_meta[article_meta['viewed'] == 1]

trace = go.Histogram(
    x=viewed['unique_user_views'],
    opacity=0.75,
    name='User Views',
    # histnorm is an enumerated string; True is not one of its values.
    # '' selects plain counts, which matches the 'Frequency' axis label.
    histnorm='',
    marker=dict(color='purple')
)

layout = go.Layout(
    height=400,
    width=600,
    title='User Engagement',
    titlefont=dict(size=13),
    margin=dict(l=50, r=30, t=40, b=50),
    bargap=0.1,
    yaxis={
        'title': 'Frequency',
        'titlefont': dict(size=12),
        'tickfont': dict(size=10),
        'gridwidth': None
    },
    xaxis={
        'title': '',
        'titlefont': dict(size=10)
    },
)

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)
# STEP 2: Construct TD-IDF matrix
#-----------------------------------------------------------------------------------------
# Function to tokenize article titles
def customtokenize(text):
    """
    Tokenize a piece of text for TF-IDF vectorization.

    The text is lower-cased, every non-alphanumeric character is replaced by
    a space, standalone numbers are removed, and the remainder is tokenized.
    English stop words are dropped and each remaining token is lemmatized.

    Parameters:
    text (str): The input text to be tokenized.

    Returns:
    list of str: List of tokenized and lemmatized words.
    """
    # normalize text: keep letters/digits only, then drop tokens that are purely numeric
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text.lower())
    text = re.sub(r'(?:\b\d+\b)', ' ', text)

    # A set gives constant-time membership checks for the stop-word filter
    stop_words = set(stopwords.words("english"))

    # Build the lemmatizer once per call instead of once per token
    lemmatizer = WordNetLemmatizer()

    return [
        lemmatizer.lemmatize(word).strip()
        for word in word_tokenize(text)
        if word not in stop_words
    ]
def tfidf_similarity(df: pd.DataFrame, title):
    """
    Calculate TF-IDF similarity matrix for a given DataFrame and text column.

    The vocabulary and IDF weights are learned from the distinct values of
    the column, and every row of df is then vectorized. The result therefore
    has exactly one row and one column per row of df, in df's row order, even
    when the same text occurs in more than one row.

    Parameters:
    - df (pd.DataFrame): Input DataFrame containing text data.
    - title (str): Name of the column in df that contains the text data.

    Returns:
    - np.ndarray: Similarity matrix of shape (len(df), len(df)) holding the
      pairwise cosine similarity between the rows' TF-IDF vectors.

    Example:
    ```
    similarity_matrix = tfidf_similarity(my_dataframe, 'title_column')
    ```
    """
    documents = df[title]

    vectorizer = TfidfVectorizer(tokenizer=customtokenize, ngram_range=(1, 3), min_df=3, use_idf=True)

    # Fit on distinct texts so a repeated title does not inflate document
    # frequencies; transform all rows so the matrix lines up with df.
    vectorizer.fit(list(documents.unique()))
    tfidf_matrix = vectorizer.transform(list(documents))

    return cosine_similarity(tfidf_matrix, tfidf_matrix)
def get_article_meta(article_ids: list, df=article_meta):
    '''
    INPUT:
    article_ids - (list) a list of article ids
    df - (pandas dataframe) article metadata with article_id and article_title
         columns; defaults to article_meta

    OUTPUT:
    article_names - (list) the distinct article_title values of the rows whose
                    article_id is in article_ids
    '''
    id_matches = df['article_id'].isin(article_ids)
    matching_titles = df.loc[id_matches, 'article_title']
    article_names = list(matching_titles.unique())

    return article_names  # Return the article names associated with list of article ids
# STEP 3: Derive Cosine Similarity
#-----------------------------------------------------------------------------------------
articles_sim = tfidf_similarity(article_meta, 'article_title')
display(articles_sim[:2])

# Summary of the scores: mean, maximum and a low percentile.
# NOTE(review): 0.5 requests the 0.5th percentile; if the median was intended this should be 50.
print(articles_sim.mean(), articles_sim.max(), np.percentile(articles_sim, 0.5))

# Label both axes of the similarity matrix with article ids.
article_id_labels = article_meta['article_id'].tolist()
articles_sim_df = pd.DataFrame(articles_sim, index=article_id_labels, columns=article_id_labels)
display(articles_sim_df.head())
# STEP 4: Find similar articles based on titles
#-----------------------------------------------------------------------------------------
aid = 0
print('Article Being Compared:', get_article_meta([aid])[0])
print()
print('Similar Articles:')
# List every other article whose title similarity to `aid` is at least 0.25.
# The loop variable is named other_id so the builtin id() is not shadowed.
for other_id, sim in articles_sim_df.loc[aid].items():
    if sim >= 0.25 and int(other_id) != aid:
        print(get_article_meta([int(other_id)])[0].title())
Heatmap of TF-IDF Similarity Matrix
The heatmap below illustrates the similarity strength among articles, using a color scale that runs from light grey (low similarity) through green (moderate similarity) to red (high similarity). Approximately 5K article pairs exhibit similarity scores of 0.6 or higher.
# Work on a copy so articles_sim itself is left untouched.
matrix_copy = articles_sim.copy()

# Every article matches itself perfectly; push the diagonal to -inf so it is ignored.
np.fill_diagonal(matrix_copy, -np.inf)

# Highest similarity between two different articles.
max_score = matrix_copy.max()

# (row, column) positions scoring at least 0.6. The matrix is pairwise, so
# each pair shows up once as (i, j) and once as (j, i).
max_indices = np.argwhere(matrix_copy >= 0.6)

# Print results
print("Maximum Score (excluding diagonal):", max_score)
print("Indices of Maximum Score (excluding diagonal):", len(max_indices))
# lets look at a heatmap of similarity scores
# Color scale: light grey at 0, green at 0.5, red at 1.
custom_colorscale = [[0, 'lightgrey'], [0.5, 'green'], [1, 'red']]

# Size the axes from the matrix itself rather than a hard-coded article count,
# so they always cover every row and column being plotted.
n_rows, n_cols = articles_sim.shape

trace = go.Heatmap(
    x=np.arange(n_cols),
    y=np.arange(n_rows),
    z=articles_sim,
    colorscale=custom_colorscale
)

layout = go.Layout(
    width=750,
    height=500,
    title='Cosine Similarity using TD-IDF'
)

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)